This notebook looks explores the data from the DSRCT sample set,
SCPCP000013. We then see if we can use expression of
DSRCT-specific genes to manually classify tumor and normal cells. The
main goal of this notebook is only to identify tumor cells,
identification and labeling of the other cells is a separate question
that we do not answer here.
suppressPackageStartupMessages({
# load required packages
library(SingleCellExperiment)
library(ggplot2)
})
## Warning: package 'GenomicRanges' was built under R version 4.4.1
## Warning: package 'S4Vectors' was built under R version 4.4.1
## Warning: package 'IRanges' was built under R version 4.4.1
# Set default ggplot theme
theme_set(
theme_bw()
)
# The base path for the OpenScPCA repository, found by its (hidden) .git directory
repository_base <- rprojroot::find_root(rprojroot::is_git_root)
# The current data directory, found within the repository base directory
data_dir <- file.path(repository_base, "data", "current")
# sample_dir <- file.path(data_dir, "SCPCP000013", params$sample_id)
# The path to this module
module_base <- file.path(repository_base, "analyses", "cell-type-DSRCT")
metadata_file <- file.path(data_dir, "SCPCP000013", "single_cell_metadata.tsv")
metadata <- read.csv(metadata_file, sep = "\t")
metadata_DSRCT <- dplyr::filter(metadata, diagnosis == "Desmoplastic small round cell tumor")
sce_file_list <- file.path(
data_dir, "SCPCP000013",
metadata_DSRCT$scpca_sample_id,
paste0(metadata_DSRCT$scpca_library_id, "_processed.rds")
)
marker_genes <- file.path(module_base, "references", "tumor-marker-genes.tsv")
# output tumor/normal classifications
results_dir <- file.path(module_base, "results", "marker_gene_analysis")
fs::dir_create(results_dir)
# classifications_filename <- glue::glue("{params$library_id}_tumor_normal_classifications.tsv")
# output_classifications_file <- file.path(results_dir, classifications_filename)
Read in each data as a separate object
SingleCellExperiment in the list.
sce_list <- lapply(sce_file_list, readr::read_rds)
# adding the sample id to the name of each SingleCellExperiment
names(sce_list) <- metadata_DSRCT$scpca_library_id
# read in marker genes table
marker_genes_df <- readr::read_tsv(marker_genes) |>
# account for genes being from multiple sources
dplyr::select(cell_type, ensembl_gene_id, gene_symbol) |>
dplyr::distinct()
## Rows: 7 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (4): cell_type, gene_symbol, ensembl_gene_id, source
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
marker_genes_df
## # A tibble: 7 × 3
## cell_type ensembl_gene_id gene_symbol
## <chr> <chr> <chr>
## 1 tumor ENSG00000117069 ST6GALNAC5
## 2 tumor ENSG00000007402 CACNA2D2
## 3 tumor ENSG00000283154 IQCJ-SCHIP1
## 4 tumor ENSG00000139304 PTPRQ
## 5 tumor ENSG00000069482 GAL
## 6 tumor ENSG00000197487 GALP
## 7 tumor ENSG00000165474 GJB2
marker_genes <- marker_genes_df |>
dplyr::filter(cell_type == "tumor") |>
dplyr::pull(ensembl_gene_id)
The first thing we do here is just create a faceted UMAP showing the expression of each marker gene for tumor cells.
umap_df_list <- lapply(sce_list, function(x) {
# pull out the UMAP coordinates and genes and make a data frame to use for plotting
umap_df <- x |>
scuttle::makePerCellDF(features = marker_genes, use.dimred = "UMAP") |>
# replace UMAP.1 with UMAP1
dplyr::rename_with(\(x) stringr::str_replace(x, "^UMAP\\.", "UMAP")) |>
# combine all genes into a single column for easy faceting
tidyr::pivot_longer(
cols = starts_with("ENSG"),
names_to = "ensembl_gene_id",
values_to = "gene_expression"
) |>
# join with marker gene df to get gene symbols for plotting
dplyr::left_join(marker_genes_df, by = c("ensembl_gene_id")) |>
dplyr::select(
barcodes,
UMAP1,
UMAP2,
gene_symbol,
ensembl_gene_id,
gene_expression,
cluster
)
})
for (i in 1:length(umap_df_list)) {
# faceted umap showing a umap panel for each marker gene
p <- ggplot(umap_df_list[[i]], aes(x = UMAP1, y = UMAP2, color = gene_expression)) +
geom_point(alpha = 0.8, size = 0.1) +
facet_wrap(vars(gene_symbol)) +
scale_color_viridis_c() +
labs(color = "Log-normalized gene expression") +
# remove axis numbers and background grid
scale_x_continuous(labels = NULL, breaks = NULL) +
scale_y_continuous(labels = NULL, breaks = NULL) +
theme(
aspect.ratio = 1,
legend.position = "bottom",
axis.title = element_text(size = 9, color = "black"),
strip.text = element_text(size = 8),
legend.title = element_text(size = 9),
legend.text = element_text(size = 8)
) +
guides(colour = guide_colorbar(title.position = "bottom", title.hjust = 0.5)) +
ggtitle(names(umap_df_list)[i])
# save plots
ggsave(
filename = paste0(
module_base,
"/plots/marker_expression_",
names(umap_df_list)[i],
".png"
),
plot = p,
width = 6,
height = 6,
units = "in",
dpi = 150
)
}
In my experience, ST6GALNAC5 is a strong marker for
DSRCT in single-cell data. As can be seen, several of the samples have
expression of ST6GALNAC5, but some do not, such as
SCPCS000731 and SCPCS000729. In fact, these SCPCS000729 contains low
number of cells. These samples in particular are the PDX samples and I
will be excluding SCPCS000729 from further analyses.
umap_df_list_excluded <- umap_df_list[!(names(umap_df_list) %in% c("SCPCS000731"))]
We can also look at the distributions for each marker gene. I would
expect to see some sort of bimodal distribution separating cells that do
and do not have expression of the marker gene. What is clear from these
plots that ST6GALNAC5, CACNA2D2,
PTPRQ, IQCJ-SCHIP1, show a bi modal
distribution among the cells. To some extent, we do see expression of
the other markers.
for (i in 1:length(umap_df_list_excluded)) {
p <- ggplot(
umap_df_list_excluded[[i]],
aes(x = gene_expression, fill = gene_symbol)
) +
geom_density() +
facet_wrap(vars(gene_symbol), scales = "free_y") +
theme(legend.position = "none") +
ggtitle(names(umap_df_list_excluded)[i])
print(p)
# save plots
ggsave(
filename = paste0(
module_base,
"/plots/marker_distribution_",
names(umap_df_list)[i],
".png"
),
plot = p,
width = 6,
height = 6,
units = "in",
dpi = 150
)
}
Although we see some slight semblance of a bimodal distribution for most marker genes, it is hard to see and we cannot directly compare gene expression values across genes. This would make it hard to identify a cut off to categorize cells as expression or not expressing the marker gene.
Now we will transform each of the gene expression vectors by generating z-scores. Then we might be able to find a cut off we can use across samples for if marker genes are present in a cell or not.
umap_df_list_excluded_scaled <- lapply(umap_df_list_excluded, function(x) {
x |>
dplyr::group_by(gene_symbol) |>
# get z-scores for each gene
dplyr::mutate(transformed_gene_expression = scale(gene_expression)[, 1]) |>
dplyr::ungroup()
})
Now we can create the same density plot but looking at z-scores.
Interestingly, in some samples, like SCPCS000489, we see that
ST6GALNAC5 has negative values. Presumably, these are tumor
cells but there may be a skew due to the number of tumor cells compared
to normal.
for (i in 1:length(umap_df_list_excluded_scaled)) {
p <- ggplot(
umap_df_list_excluded_scaled[[i]],
aes(x = transformed_gene_expression, fill = gene_symbol)
) +
geom_density() +
facet_wrap(vars(gene_symbol), scales = "free_y") +
theme(legend.position = "none") +
ggtitle(names(umap_df_list_excluded_scaled)[i])
print(p)
# save plots
ggsave(
filename = paste0(
module_base,
"/plots/marker_distribution_zscaled_",
names(umap_df_list)[i],
".png"
),
plot = p,
width = 6,
height = 6,
units = "in",
dpi = 150
)
}
## Warning: Removed 1600 rows containing non-finite outside the scale range
## (`stat_density()`).
## Removed 1600 rows containing non-finite outside the scale range
## (`stat_density()`).
## Warning: Removed 3826 rows containing non-finite outside the scale range
## (`stat_density()`).
## Warning: Removed 3826 rows containing non-finite outside the scale range
## (`stat_density()`).
## Warning: Removed 276 rows containing non-finite outside the scale range
## (`stat_density()`).
## Warning: Removed 276 rows containing non-finite outside the scale range
## (`stat_density()`).
It looks like some marker genes have distinct groups of cells with z-score > 0, while other marker genes may not be as informative.
Let’s try and use the marker gene expression to classify tumor cells. It looks like we could use a cutoff of z-score > 0 to count that cell as a tumor cell.
We could either count any cell that expresses at least one marker gene > 0 as a tumor cell, or look at the combined expression. Let’s start with classifying tumor cells as tumor if any marker gene is present (z-score > 0).
Below, we can get the sum of the transformed gene expression of all marker genes and plot in a single UMAP.
# calculate sum gene expression across all marker genes in list
marker_sum_exp <- lapply(umap_df_list_excluded_scaled, function(x) {
x |>
dplyr::group_by(barcodes) |>
dplyr::mutate(sum_exp = sum(transformed_gene_expression, na.rm = T)) |>
dplyr::select(barcodes, UMAP1, UMAP2, sum_exp, cluster) |>
dplyr::distinct()
})
# plot mean gene expression
for (i in 1:length(marker_sum_exp)) {
p <- ggplot(marker_sum_exp[[i]], aes(x = UMAP1, y = UMAP2, color = sum_exp)) +
geom_point(size = 0.5, alpha = 0.5) +
scale_color_viridis_c() +
ggtitle(names(umap_df_list_excluded_scaled)[i])
print(p)
# save plots
ggsave(
filename = paste0(
module_base,
"/plots/UMAP_marker_expression_",
names(umap_df_list)[i],
".png"
),
plot = p,
width = 6,
height = 6,
units = "in",
dpi = 150
)
}
Similar to the individual plots, it looks like there is one group of
cells on the bottom right that has the highest marker gene expression.
We would anticipate that these are most likely to be the tumor
cells.
Now let’s classify any cell that has a sum of marker genes > 0 (after z-transformation) as tumor cells.
# classify tumor cells based on presence of any marker genes
marker_sum_exp <- lapply(marker_sum_exp, function(x) {
x |>
dplyr::mutate(sum_classification = dplyr::if_else(sum_exp > 0, "Tumor", "Normal"))
})
for (i in 1:length(marker_sum_exp)) {
p <- ggplot(marker_sum_exp[[i]], aes(x = UMAP1, y = UMAP2, color = sum_classification)) +
geom_point(size = 0.5, alpha = 1) +
ggtitle(names(umap_df_list_excluded_scaled)[i])
print(p)
# save plots
ggsave(
filename = paste0(
module_base,
"/plots/UMAP_classification_",
names(umap_df_list)[i],
".png"
),
plot = p,
width = 6,
height = 6,
units = "in",
dpi = 150
)
}
This gives us a rough idea of cells that may be classified as tumor cells. However, I believe re-doing this analysis using the cluster level data may give us better results. This is highly dependent on whether the clusters were generated with enough granularity.
# calculate sum gene expression across all marker genes in list
marker_sum_exp <- lapply(umap_df_list_excluded_scaled, function(x) {
x |>
dplyr::group_by(cluster) |> # change to cluster
dplyr::mutate(sum_exp = sum(transformed_gene_expression, na.rm = T)) |>
dplyr::select(barcodes, UMAP1, UMAP2, sum_exp, cluster) |>
dplyr::distinct()
})
# plot mean gene expression
for (i in 1:length(marker_sum_exp)) {
p <- ggplot(marker_sum_exp[[i]], aes(x = UMAP1, y = UMAP2, color = sum_exp)) +
geom_point(size = 0.5, alpha = 0.5) +
scale_color_viridis_c() +
ggtitle(names(umap_df_list_excluded_scaled)[i])
print(p)
# save plots
ggsave(
filename = paste0(module_base, "/plots/UMAP_marker_expression_cluster_", names(umap_df_list)[i], ".png"),
plot = p,
width = 6,
height = 6,
units = "in",
dpi = 150
)
}
It looks like there are group of cells with high marker gene expression.
We anticipate that these are most likely to be the tumor cells. The
groups with low or negative values are likely normal cells.
Now let’s classify clusters that has a sum of marker genes > 0 (after z-transformation) as tumor cell clusters.
# classify tumor cells based on presence of any marker genes
marker_sum_exp <- lapply(marker_sum_exp, function(x) {
d <- density(x$sum_exp)
x |>
dplyr::mutate(sum_classification = dplyr::if_else(sum_exp > 0, "Tumor", "Normal"))
})
for (i in 1:length(marker_sum_exp)) {
p <- ggplot(marker_sum_exp[[i]], aes(x = UMAP1, y = UMAP2, color = sum_classification)) +
geom_point(size = 0.5, alpha = 1) +
ggtitle(names(umap_df_list_excluded_scaled)[i])
print(p)
# save plots
ggsave(
filename = paste0(
module_base,
"/plots/UMAP_classification_cluster_",
names(umap_df_list)[i],
".png"
),
plot = p,
width = 6,
height = 6,
units = "in",
dpi = 150
)
}
Based on my experiences, the above plots show that we are unable to adequately determine the tumor cells from the normal cells. This can be seen in SCPCS000729, which is most likely all tumor cells, since it is collected from a patient-derived xenograft.
ST6GALNAC5, PTRPQ, and
IQCJ-SCHIP1 are good markers for the DSRCT cells within
this data. The other markers are not as highly expressed either due to
heterogeneity or data quality.SingleR or
CellAssign to identify the normal cells and then work to
identify remaining cells as tumor cells. Normal cells may have more
definitive markers than tumor cells.# get an RDS of the processed data
saveRDS(sce_file_list, "../results/SCPCP000013_sce_file_list.rds")
# get an RDS of the UMAP data with marker genes
saveRDS(umap_df_list, file.path(module_base, "results/SCPCP000013_umap_df_list.rds"))
# record the versions of the packages used in this analysis and other environment information
sessionInfo()
## R version 4.4.0 (2024-04-24)
## Platform: aarch64-apple-darwin20
## Running under: macOS 15.1
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.0
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: America/New_York
## tzcode source: internal
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] ggplot2_3.5.1 SingleCellExperiment_1.26.0
## [3] SummarizedExperiment_1.34.0 Biobase_2.64.0
## [5] GenomicRanges_1.56.2 GenomeInfoDb_1.40.1
## [7] IRanges_2.38.1 S4Vectors_0.42.1
## [9] BiocGenerics_0.50.0 MatrixGenerics_1.16.0
## [11] matrixStats_1.4.1
##
## loaded via a namespace (and not attached):
## [1] gtable_0.3.5 xfun_0.48
## [3] bslib_0.8.0 lattice_0.22-6
## [5] tzdb_0.4.0 vctrs_0.6.5
## [7] tools_4.4.0 generics_0.1.3
## [9] parallel_4.4.0 tibble_3.2.1
## [11] fansi_1.0.6 highr_0.11
## [13] pkgconfig_2.0.3 Matrix_1.7-0
## [15] sparseMatrixStats_1.16.0 lifecycle_1.0.4
## [17] GenomeInfoDbData_1.2.12 farver_2.1.2
## [19] stringr_1.5.1 compiler_4.4.0
## [21] munsell_0.5.1 codetools_0.2-20
## [23] htmltools_0.5.8.1 sass_0.4.9
## [25] yaml_2.3.10 pillar_1.9.0
## [27] crayon_1.5.3 jquerylib_0.1.4
## [29] tidyr_1.3.1 BiocParallel_1.38.0
## [31] DelayedArray_0.30.1 cachem_1.1.0
## [33] abind_1.4-8 tidyselect_1.2.1
## [35] digest_0.6.37 stringi_1.8.4
## [37] purrr_1.0.2 dplyr_1.1.4
## [39] labeling_0.4.3 rprojroot_2.0.4
## [41] fastmap_1.2.0 grid_4.4.0
## [43] colorspace_2.1-1 cli_3.6.3
## [45] SparseArray_1.4.8 magrittr_2.0.3
## [47] S4Arrays_1.4.1 utf8_1.2.4
## [49] readr_2.1.5 withr_3.0.1
## [51] DelayedMatrixStats_1.26.0 scales_1.3.0
## [53] UCSC.utils_1.0.0 bit64_4.5.2
## [55] rmarkdown_2.28 XVector_0.44.0
## [57] httr_1.4.7 bit_4.5.0
## [59] hms_1.1.3 beachmat_2.20.0
## [61] evaluate_1.0.1 knitr_1.48
## [63] viridisLite_0.4.2 rlang_1.1.4
## [65] Rcpp_1.0.13 scuttle_1.14.0
## [67] glue_1.8.0 vroom_1.6.5
## [69] jsonlite_1.8.9 R6_2.5.1
## [71] fs_1.6.4 zlibbioc_1.50.0